Setup

Set up packages, chart themes, slack notifications etc

Load user and event data

#load the user data
user <- read_csv("../../data/user_event/mode_users.csv", 
                 col_types = list(created_at = col_datetime(),
                                  activated_at = col_datetime()) )
user <-tbl_df(user)

#load the event data
event <- read_csv("../../data/user_event/mode_user_events.csv", 
                 col_types = list(occurred_at = col_datetime())) 
event <- tbl_df(event)


#view the number of rows and cols
dim(user)
## [1] 18911     6
dim(event)
## [1] 285260      6
#glimpse the structure
glimpse(user)
## Observations: 18911
## Variables:
## $ user_id      (dbl) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
## $ created_at   (time) 2013-01-01 14:32:28, 2013-01-01 09:56:58, 2013-0...
## $ company_id   (dbl) 5373, 1877, 6135, 12910, 8966, 792, 92, 5370, 907...
## $ language     (chr) "french", "indian", "english", "english", "englis...
## $ activated_at (time) NA, NA, 2013-01-01 18:21:55, NA, 2013-01-01 05:3...
## $ state        (chr) "pending", "pending", "active", "pending", "activ...
glimpse(event)
## Observations: 285260
## Variables:
## $ user_id     (dbl) 8546, 8546, 8546, 8546, 8546, 8546, 8546, 8546, 85...
## $ occurred_at (time) 2014-05-02 13:21:16, 2014-05-02 13:21:52, 2014-05...
## $ event_type  (chr) "engagement", "engagement", "engagement", "engagem...
## $ event_name  (chr) "login", "like_message", "home_page", "like_messag...
## $ location    (chr) "Indonesia", "Indonesia", "Indonesia", "Indonesia"...
## $ device      (chr) "macbook pro", "macbook pro", "macbook pro", "macb...
#view the top few rows
datatable(head(user))

datatable(head(event))

Subset User and Event dataframes

#keep only required columns in user and event dataframes
user <-
  user %>%
  select(user_id,
         activated_at,
         state,
         language) 

event <-
  event %>%
  select(user_id,
         event_name,
         occurred_at,
         device,
         location)

Cohort by activation week

#build the cohort dataframe
cohort_df <-
  user %>%
  #only keep users that were activated May 1 2014 onwards
  filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%

  #get the week starting date as cohort
  #mutate(cohort = floor_date(activated_at, "week") + days(1)  ) %>%  #to make the week start on Monday
  mutate(cohort = floor_date(activated_at, "week") ) %>% 
  
  #join user activity(events table)
  inner_join(event,  by = c("user_id" = "user_id")) %>%
  
  #create column to store diff in weeks from activation date for each event/activity
  mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
  
  group_by(cohort) %>%
  #for each cohort calculate the longest observation window
  mutate(cutoff_age = max(period_age)) %>%
  group_by(cohort, cutoff_age, period_age) %>% 
  
  #count the no. of users with activity in each observation time point
  summarise(tally = n_distinct(user_id)) %>%
  
  #store the starting no. of users in the cohort
  mutate(first_period = max(tally)) %>%
  
  #compute retention rate for each time point
  mutate(retention = tally/first_period) %>%
  ungroup() %>%
  arrange(cohort, cutoff_age, period_age)

#change layout to wide view
cohort_visual_df <-
cohort_df %>%
  select(cohort, first_period, period_age, retention) %>%
  mutate(retention = round(retention, digits = 2)) %>%
  spread(period_age, retention) %>%
  arrange(cohort)

datatable(cohort_visual_df)

Line chart

cohort_df %>%
  filter(period_age >= 0) %>%
ggplot(aes(x = period_age, y = retention)) +
  geom_line(alpha = 0.5, size = 0.75, aes(group = cohort, color = as.character(cohort))) +
  scale_color_manual(values = tableau_color_pal("tableau20")(20)) +
  scale_y_continuous(labels = percent, limits = c(0,1)) +
  scale_x_discrete() +
  labs(x="Weeks after signup ", y = "Retention rate") +
  chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 1, hgrid = 1, mhgrid = 1) +
  legend_show(position = "right") +
  guides(color = guide_legend(title = "Week Starting"))

Heatmap

#cols = colorRampPalette(rev(brewer.pal(11, "RdYlGn")), space="Lab")
cols = colorRampPalette((brewer.pal(7, "RdYlGn")), space="Lab")

cohort_df %>%
  filter(period_age > 0) %>%
  #mutate(cohort = reorder(cohort, first_period)) %>%
ggplot(aes(x = period_age, y = cohort  )) +
  geom_tile(color = "white", size = 0.5, alpha = 0.8, aes(fill = retention)) +
  scale_fill_gradientn(colours = cols(5)) +
  geom_text(color = "black", size = 3.5, aes(x = period_age, y = cohort, label = sprintf("%1.0f%%", 100*retention) )) +
  geom_text(data = filter(cohort_df, period_age <1), color = "dark green", size = 4, fontface= "bold",
            aes(x = period_age, y = cohort, label = tally, hjust = 1)) +
  scale_y_datetime(breaks = date_breaks("1 week"), labels = date_format("%b - %d")) +
  expand_limits(x = -1) + #to accomodate the week0 text labels
  scale_x_discrete(limits = seq(1, 17, 1)) +
  labs(x="Retention rate by weeks after signup\n ", y = "Signup week") +
  chart_theme_custom_base(fsize = 14, vgrid = 0, hgrid = 0, font_family = "") +
  theme(axis.text.y=element_text(vjust = 1)) +
  legend_show(size_label_font = 10, position = "bottom") +
  guides(fill = guide_colorbar( barwidth = 10, barheight = 0.5, title.vjust = 1, title = "Retention Rate" ))

Cohort by user language

#build the cohort dataframe
cohort_df <-
  user %>%
  #only keep users that were activated May 1 2014 onwards
  filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%
  
  #set the user language as cohort
  mutate(cohort = language) %>% 
  
  #join user activity(events table)
  inner_join(event, by = c("user_id" = "user_id")) %>%
  
  #create column to store diff in weeks from activation date for each event/activity
  mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
  
  group_by(cohort) %>%
  
  #for each cohort calculate the longest observation window
  mutate(cutoff_age = max(period_age)) %>%
  group_by(cohort, cutoff_age, period_age) %>% 
  
  #count the no. of users with activity in each observation time point
  summarise(tally = n_distinct(user_id)) %>%
  
  #store the starting no. of users in the cohort
  mutate(first_period = max(tally)) %>%
  #compute retention rate for each time point
  mutate(retention = tally/first_period) %>%
  ungroup() %>%
  arrange(cohort, cutoff_age, period_age)

#change layout
cohort_visual_df <-
cohort_df %>%
  select(cohort, first_period, period_age, retention) %>%
  mutate(retention = round(retention, digits = 2)) %>%
  spread(period_age, retention) %>%
  arrange(cohort)

datatable(cohort_visual_df)

Line chart

cohort_df %>%
  filter(period_age >= 0) %>%
ggplot(aes(x = period_age, y = retention)) +
  geom_line(alpha = 0.8, size = 0.5, aes(group = cohort, color = cohort)) +
  
  scale_color_manual(values = tableau_color_pal("tableau20")(20)) +
  
  scale_y_continuous(labels = percent, limits = c(0,1)) +
  scale_x_discrete() +
  labs(x="Weeks after signup ", y = "Retention rate") +
  chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 1, hgrid = 1) +
  legend_show(position = "right", show_title = 0) +
  legend_size(symbol_size = 1)

Heatmap

#cols = colorRampPalette(rev(brewer.pal(11, "RdYlGn")), space="Lab")
cols = colorRampPalette((brewer.pal(7, "RdYlGn")), space="Lab")

cohort_df %>%
  filter(period_age > 0) %>%
  mutate(cohort = reorder(cohort, first_period)) %>%
ggplot(aes(x = period_age, y = cohort  )) +
  geom_tile(color = "white", size = 0.5, alpha = 0.8, aes(fill = retention)) +
  scale_fill_gradientn(colours = cols(5)) +
  geom_text(color = "black", size = 3.5, aes(x = period_age, y = cohort, label = sprintf("%1.0f%%", 100*retention) )) +
  geom_text(data = filter(cohort_df, period_age <1), color = "dark green", size = 4, fontface= "bold",
            aes(x = period_age, y = cohort, label = tally, hjust = 1)) +
  
  expand_limits(x = -1) + #to accomodate the week0 text labels
  scale_x_discrete(limits = seq(1, 17, 1)) +
  labs(x="Retention rate by weeks after signup ", y = "Language") +
  chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 0, hgrid = 0) 

Cohort by user device(used for signup)

#see the different status codes. We are looking for signup completion
freq_dist(event, event_name) %>% print(n = 50)
## Source: local data frame [21 x 3]
## 
##                event_name     n  perc
## 1               home_page 76239 26.7%
## 2            like_message 48124 16.9%
## 3              view_inbox 45349 15.9%
## 4                   login 33854 11.9%
## 5            send_message 26882  9.4%
## 6     search_autocomplete 15996  5.6%
## 7              search_run 11679  4.1%
## 8             create_user  7159  2.5%
## 9             enter_email  4303  1.5%
## 10             enter_info  3812  1.3%
## 11        complete_signup  3631  1.3%
## 12  search_click_result_2  1182  0.4%
## 13  search_click_result_1  1165  0.4%
## 14  search_click_result_4  1068  0.4%
## 15  search_click_result_3  1019  0.4%
## 16  search_click_result_5   855  0.3%
## 17  search_click_result_9   688  0.2%
## 18  search_click_result_6   682  0.2%
## 19  search_click_result_7   568  0.2%
## 20  search_click_result_8   562  0.2%
## 21 search_click_result_10   443  0.2%
#get list of users who completed signup
users_signed_up <-
  event %>%
  filter(event_name == "complete_signup") %>%
  #set the user language as cohort
  mutate(cohort = device) %>%
  filter(!is.na(cohort)) %>%
  select(user_id, cohort)

#build the cohort dataframe
cohort_df <-
  user %>%
  #only keep users that were activated May 1 2014 onwards
  filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%
  
  #join users who competed sign up 
  inner_join(users_signed_up, by = c("user_id" = "user_id")) %>%
  #join user activity(events table)
  inner_join(event, by = c("user_id" = "user_id")) %>%
  
  #create column to store diff in weeks from activation date for each event/activity
  mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
  
  group_by(cohort) %>%
  
  #for each cohort calculate the longest observation window
  mutate(cutoff_age = max(period_age)) %>%
  group_by(cohort, cutoff_age, period_age) %>% 
  
  #count the no. of users with activity in each observation time point
  summarise(tally = n_distinct(user_id)) %>%
  
  #store the starting no. of users in the cohort
  mutate(first_period = max(tally)) %>%
  #compute retention rate for each time point
  mutate(retention = tally/first_period) %>%
  ungroup() %>%
  arrange(cohort, cutoff_age, period_age)

#change layout
cohort_visual_df <-
cohort_df %>%
  select(cohort, first_period, period_age, retention) %>%
  mutate(retention = round(retention, digits = 2)) %>%
  spread(period_age, retention) %>%
  arrange(desc(first_period))

datatable(cohort_visual_df)

Line chart

cohort_df %>%
  filter(period_age >= 0) %>%
ggplot(aes(x = period_age, y = retention)) +
  geom_line(alpha = 0.8, size = 0.5, aes(group = cohort, colour = cohort)) +
  scale_y_continuous(labels = percent, limits = c(0,1)) +
  scale_x_discrete() +
  labs(x="Weeks after signup ", y = "Retention rate") +
  chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 1, hgrid = 1, mhgrid = 1) +
  legend_show(position = "right", show_title = 0) +
  legend_size(symbol_size = 1) 

Heatmap

#cols = colorRampPalette(rev(brewer.pal(11, "RdYlGn")), space="Lab")
cols = colorRampPalette((brewer.pal(7, "RdYlGn")), space="Lab")
blues = brewer.pal(9, "Blues")


cohort_df %>%
  filter(period_age > 0) %>%
  mutate(cohort = reorder(cohort, first_period)) %>%
ggplot(aes(x = period_age, y = cohort  )) +
  geom_tile(color = "white", size = 0.5, alpha = 0.8, aes(fill = retention)) +
  scale_fill_gradientn(colours = cols(5)) +
  geom_text(color = "black", size = 3, aes(x = period_age, y = cohort, label = sprintf("%1.0f%%", 100*retention) )) +
  geom_text(data = filter(cohort_df, period_age <1), color = "dark green", size = 4, fontface= "bold",
            aes(x = period_age, y = cohort, label = tally, hjust = 1)) +
  
  expand_limits(x = -1) + #to accomodate the week0 text labels
  scale_x_discrete(limits = seq(1, 17, 1)) +
  labs(x="Retention rate by weeks after signup ", y = "Device used for signup") +
  chart_theme_custom_base(fsize = 14, background_grey = 0, vgrid = 0, hgrid = 0) 

Layer cake chart

#build the cohort dataframe
cohort_layer_df <-
  user %>%
  #only keep users that were activated May 1 2014 onwards
  filter(!is.na(activated_at) & activated_at >= '2014-05-01') %>%

  #get the week starting date as cohort
  #mutate(cohort = floor_date(activated_at, "week") + days(1)  ) %>%  #to make the week start on Monday
  mutate(cohort = floor_date(activated_at, "week") ) %>% 
  
  #join user activity(events table)
  inner_join(event,  by = c("user_id" = "user_id")) %>%
  
  #create column to store diff in weeks from activation date for each event/activity
  mutate(period_age = floor(day( seconds_to_period(occurred_at - activated_at))/7) ) %>%
  mutate(week = floor_date(occurred_at, "week" )) %>%
  #group_by(cohort) %>%
  #for each cohort calculate the longest observation window
  #mutate(cutoff_age = max(period_age)) %>%
  #group_by(cohort, cutoff_age, period_age) %>% 
  group_by(cohort, week) %>% 
  #count the no. of users with activity in each observation time point
  summarise(tally = n_distinct(user_id)) 
  

#change layout to wide view
cohort_layer_visual_df <-
cohort_layer_df %>%
  spread(week, tally) %>%
  arrange(cohort)

datatable(cohort_layer_visual_df)

#build the layered cohort chart
# setup the color palette
blues = colorRampPalette((brewer.pal(5, "Blues")), space="Lab")

cohort_layer_visual_df %>%
  gather(week, tally, -cohort) %>%
  mutate(tally=replace(tally, is.na(tally), 0)) %>%
  mutate(week = ymd(week)) %>%
  filter(week < "2014-08-30") %>%
  mutate(cohort = as.factor(cohort)) %>%
ggplot(aes(x = week, y = tally,  fill = cohort, group = cohort)) +
  geom_area(color = "white", size = 0.01, aes(fill = cohort), alpha = 1) +
  chart_theme_custom_base(mhgrid = 1, mvgrid = 1) +
  legend_show(position = "right", show_title = 0, size_label_font = 10) +
  scale_x_datetime() +
  scale_fill_manual(values = blues(19)) +
  guides(fill =  guide_legend(reverse = TRUE)) +
  labs(x="Weeks", y = "Number of users active in a week") 

Test area

d1 <- ymd_hms("2014-06-01 13:35:52")
d1
## [1] "2014-06-01 13:35:52 UTC"
d2 <- ymd_hms("2014-06-03 13:35:52")
d2
## [1] "2014-06-03 13:35:52 UTC"
d1 - d2
## Time difference of -2 days
ratio <- .51
sprintf("%1.0f%%", 100*ratio)
## [1] "51%"
col_pal = brewer.pal(3, "Set2")

#display.brewer.all() #RdYIGn
# View 5 colors for the Set2 palette
display.brewer.pal(7,"RdYlGn")

col_pal = brewer.pal(7, "RdYlGn")
col_pal[1]
## [1] "#D73027"
display.brewer.pal(9,"Blues")

blues = brewer.pal(9, "Blues")
col_pal[1]
## [1] "#D73027"